# -*-Python-*-

# Talking-heads version of t5.1.1.large.gin
# https://arxiv.org/abs/2003.02436
# 32 talking heads with d_kv=32
# No model-parallelism, which would probably be very slow with talking-heads
# This probably only runs on TPUv3, not TPUv2, due to memory constraints.

# Pull in the base config this file builds on; the bindings below come after
# the include, so they are applied on top of whatever it defines.
include 'models/t5.1.th.base.gin'

# Size settings for the large variant.
# NOTE(review): d_kv is not bound in this file; the d_kv=32 mentioned in the
# header is presumably inherited from the included base config -- confirm.
d_model = 1024
num_layers = 24
d_ff = 2816
# Corresponds to the "32 talking heads" stated in the header.
num_heads = 32
